This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_500/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Chryseobacterium sp. | Chryseobacterium sp. 3008163 | Chryseobacterium sp. 6424 | Chryseobacterium sp. 7-3A | Chryseobacterium sp. IHB B 17019 | Chryseobacterium sp. POL2 | Chryseobacterium sp. T16E-39 | ... | Xenorhabdus nematophila | Yoonia vestfoldensis | Youhaiella tibetensis | Bathymodiolus thermophilus thioautotrophic gill symbiont | Ndongobacter massiliensis | Phreatobacter stygius | Pseudohongiella spirulinae | Rickettsiales endosymbiont of Stachyamoeba lipophora | endosymbiont of unidentified scaly snail isolate Monju | Apis mellifera filamentous virus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1564 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Chryseobacterium sp. |
| 2 | Chryseobacterium sp. 3008163 |
| 3 | Chryseobacterium sp. 6424 |
| 4 | Chryseobacterium sp. 7-3A |
| ... | ... |
| 1557 | Phreatobacter stygius |
| 1558 | Pseudohongiella spirulinae |
| 1559 | Rickettsiales endosymbiont of Stachyamoeba lip... |
| 1560 | endosymbiont of unidentified scaly snail isola... |
| 1561 | Apis mellifera filamentous virus |
1562 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b1fcd9413d0>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b20046dadd0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
0
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 1562)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 1562) (384, 57) (1562, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_1K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Chryseobacterium sp. | Chryseobacterium sp. 3008163 | Chryseobacterium sp. F5649 | Chryseobacterium sp. G0186 | ... | Thiolapillus brandeum | endosymbiont of Pachyrhynchus infernalis | Candidatus Cloacimonas acidaminovorans | Thermobaculum terrenum | endosymbiont 'TC1' of Trimyema compressum | endosymbiont of unidentified scaly snail isolate Monju | Phormidium phage MIS-PhV1A | Pyrococcus abyssi virus 1 | White spot syndrome virus | Human feces pecovirus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 2301 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 2 | Blochmannia endosymbiont of Camponotus nippone... |
| 3 | Blochmannia endosymbiont of Polyrhachis (Hedom... |
| 4 | Chryseobacterium sp. |
| ... | ... |
| 2294 | endosymbiont of unidentified scaly snail isola... |
| 2295 | Phormidium phage MIS-PhV1A |
| 2296 | Pyrococcus abyssi virus 1 |
| 2297 | White spot syndrome virus |
| 2298 | Human feces pecovirus |
2299 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ab9f0ba74d0>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ab9f3cf4090>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
0
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 2299)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 2299) (384, 57) (2299, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_5K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Candidatus Aquiluna sp. UB-MaderosW2red | Chryseobacterium sp. | Chryseobacterium sp. 3008163 | Chryseobacterium sp. 6424 | ... | Conexivisphaera calidus | Pandoravirus dulcis | Pandoravirus inopinatum | Pandoravirus quercus | Thermobaculum terrenum | endosymbiont 'TC1' of Trimyema compressum | endosymbiont of unidentified scaly snail isolate Monju | Diatraea saccharalis granulovirus | Salterprovirus His1 | Human feces pecovirus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 4624 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 2 | Blochmannia endosymbiont of Camponotus nippone... |
| 3 | Blochmannia endosymbiont of Polyrhachis (Hedom... |
| 4 | Candidatus Aquiluna sp. UB-MaderosW2red |
| ... | ... |
| 4617 | endosymbiont 'TC1' of Trimyema compressum |
| 4618 | endosymbiont of unidentified scaly snail isola... |
| 4619 | Diatraea saccharalis granulovirus |
| 4620 | Salterprovirus His1 |
| 4621 | Human feces pecovirus |
4622 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ac679ef0190>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ac67dad1fd0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
0
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 4622)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 4622) (384, 57) (4622, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_10K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Colobopsis nipponica | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Candidatus Aquiluna sp. 15G-AUS-rot | Candidatus Aquiluna sp. UB-MaderosW2red | Chryseobacterium sp. | ... | Apis mellifera filamentous virus | Beihai tombus-like virus 19 | Beihai weivirus-like virus 2 | Hubei odonate virus 7 | Hubei picorna-like virus 52 | Phormidium phage MIS-PhV1A | Wuhan insect virus 13 | Cotesia congregata bracovirus | Heliothis zea nudivirus | Human feces pecovirus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 5462 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 2 | Blochmannia endosymbiont of Camponotus nippone... |
| 3 | Blochmannia endosymbiont of Colobopsis nipponica |
| 4 | Blochmannia endosymbiont of Polyrhachis (Hedom... |
| ... | ... |
| 5455 | Phormidium phage MIS-PhV1A |
| 5456 | Wuhan insect virus 13 |
| 5457 | Cotesia congregata bracovirus |
| 5458 | Heliothis zea nudivirus |
| 5459 | Human feces pecovirus |
5460 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2aabb15a1050>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2aabae6b5750>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
0
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 5460)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 5460) (384, 57) (5460, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_25K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Alphamesonivirus 4 | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Colobopsis nipponica | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Candidatus Aquiluna sp. 15G-AUS-rot | Candidatus Aquiluna sp. UB-MaderosW2red | ... | Cotesia congregata bracovirus | Diatraea saccharalis granulovirus | Ectropis obliqua nucleopolyhedrovirus | Epiphyas postvittana nucleopolyhedrovirus | Glossina hytrosavirus | Spodoptera frugiperda granulovirus | Sulfolobales Mexican fusellovirus 1 | Sulfolobus monocaudavirus SMV4 | White spot syndrome virus | Human feces pecovirus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 6146 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Alphamesonivirus 4 |
| 2 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 3 | Blochmannia endosymbiont of Camponotus nippone... |
| 4 | Blochmannia endosymbiont of Colobopsis nipponica |
| ... | ... |
| 6139 | Spodoptera frugiperda granulovirus |
| 6140 | Sulfolobales Mexican fusellovirus 1 |
| 6141 | Sulfolobus monocaudavirus SMV4 |
| 6142 | White spot syndrome virus |
| 6143 | Human feces pecovirus |
6144 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b0837791090>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b083bb81950>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
3609
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 2535)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 2535) (384, 57) (2535, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_50K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Colobopsis nipponica | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Candidatus Aquiluna sp. 15G-AUS-rot | Candidatus Aquiluna sp. UB-MaderosW2red | Chryseobacterium sp. | ... | Helicoverpa armigera nucleopolyhedrovirus | Mamestra brassicae multiple nucleopolyhedrovirus | Mythimna unipuncta nucleopolyhedrovirus | Spodoptera exigua multiple nucleopolyhedrovirus | Thermococcus prieurii virus 1 | Thysanoplusia orichalcea nucleopolyhedrovirus | Tomelloso virus | Human feces pecovirus | Mollivirus sibericum | Pacmanvirus A23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 6485 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 2 | Blochmannia endosymbiont of Camponotus nippone... |
| 3 | Blochmannia endosymbiont of Colobopsis nipponica |
| 4 | Blochmannia endosymbiont of Polyrhachis (Hedom... |
| ... | ... |
| 6478 | Thysanoplusia orichalcea nucleopolyhedrovirus |
| 6479 | Tomelloso virus |
| 6480 | Human feces pecovirus |
| 6481 | Mollivirus sibericum |
| 6482 | Pacmanvirus A23 |
6483 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ba6b6f87fd0>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ba6b6f76ad0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
4890
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 1593)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 1593) (384, 57) (1593, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_100K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Brassica napus | Alphacoronavirus 1 | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Colobopsis nipponica | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | Candidatus Aquiluna sp. 15G-AUS-rot | ... | Sulfolobales Mexican fusellovirus 1 | Sulfolobus monocaudavirus SMV4 | Sulfolobus virus STSV2 | Thermoproteus tenax spherical virus 1 | Tomelloso virus | Halovirus VNH-1 | Human feces pecovirus | Leptopilina boulardi filamentous virus | Mollivirus sibericum | Pacmanvirus A23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 6785 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Brassica napus |
| 2 | Alphacoronavirus 1 |
| 3 | Blochmannia endosymbiont of Camponotus (Colobo... |
| 4 | Blochmannia endosymbiont of Camponotus nippone... |
| ... | ... |
| 6778 | Halovirus VNH-1 |
| 6779 | Human feces pecovirus |
| 6780 | Leptopilina boulardi filamentous virus |
| 6781 | Mollivirus sibericum |
| 6782 | Pacmanvirus A23 |
6783 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b2ea53edd10>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b2ea93e48d0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
5682
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 1101)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 1101) (384, 57) (1101, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_500K/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Alphamesonivirus 4 | Alphamesonivirus 6 | Avian coronavirus | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | Blochmannia endosymbiont of Camponotus nipponensis | Blochmannia endosymbiont of Colobopsis nipponica | Blochmannia endosymbiont of Polyrhachis (Hedomyrma) turneri | ... | White spot syndrome virus | bacterium | Aedes camptorhynchus reo-like virus | Daeseongdong virus 1 | Human feces pecovirus | Leptopilina boulardi filamentous virus | Mollivirus sibericum | Pacmanvirus A23 | Sewage-associated circular DNA virus-4 | uncultured Mediterranean phage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | 0.0 | NaN | 0.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.02 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 7545 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Alphamesonivirus 4 |
| 2 | Alphamesonivirus 6 |
| 3 | Avian coronavirus |
| 4 | Blochmannia endosymbiont of Camponotus (Colobo... |
| ... | ... |
| 7538 | Leptopilina boulardi filamentous virus |
| 7539 | Mollivirus sibericum |
| 7540 | Pacmanvirus A23 |
| 7541 | Sewage-associated circular DNA virus-4 |
| 7542 | uncultured Mediterranean phage |
7543 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2af5441eb7d0>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2af546f118d0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
6729
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 814)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 814) (384, 57) (814, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_1M/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Brassica napus | Coronavirus AcCoV-JC34 | Alphamesonivirus 1 | Alphamesonivirus 4 | Alphamesonivirus 6 | Avian coronavirus | Blochmannia endosymbiont of Camponotus (Colobopsis) obliquus | ... | bacterium | Halovirus VNH-1 | Human feces pecovirus | Leptopilina boulardi filamentous virus | Mollivirus sibericum | Pacmanvirus A23 | Posavirus sp. | Sewage-associated circular DNA virus-4 | Zirqa virus | uncultured Mediterranean phage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 7979 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Brassica napus |
| 2 | Coronavirus AcCoV-JC34 |
| 3 | Alphamesonivirus 1 |
| 4 | Alphamesonivirus 4 |
| ... | ... |
| 7972 | Pacmanvirus A23 |
| 7973 | Posavirus sp. |
| 7974 | Sewage-associated circular DNA virus-4 |
| 7975 | Zirqa virus |
| 7976 | uncultured Mediterranean phage |
7977 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ae8a1627850>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2ae8a401a9d0>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
7175
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 802)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 802) (384, 57) (802, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_5M/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Brassica napus | BtRf-AlphaCoV/YN2012 | Coronavirus AcCoV-JC34 | Alphamesonivirus 1 | Alphamesonivirus 2 | Alphamesonivirus 4 | Alphamesonivirus 6 | ... | Lake Sarah-associated circular virus-15 | Leptopilina boulardi filamentous virus | Mollivirus sibericum | Pacmanvirus A23 | Posavirus sp. | Rasavirus sp. | Rudphi virus 5 | Sewage-associated circular DNA virus-4 | Zirqa virus | uncultured Mediterranean phage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 8967 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Brassica napus |
| 2 | BtRf-AlphaCoV/YN2012 |
| 3 | Coronavirus AcCoV-JC34 |
| 4 | Alphamesonivirus 1 |
| ... | ... |
| 8960 | Rasavirus sp. |
| 8961 | Rudphi virus 5 |
| 8962 | Sewage-associated circular DNA virus-4 |
| 8963 | Zirqa virus |
| 8964 | uncultured Mediterranean phage |
8965 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b42e25a53d0>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b42ad7c7190>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
8181
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 784)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 784) (384, 57) (784, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here, we select all samples with complete clincal data with respect to sex.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
inp_mat_file = '/project/data/raw/justin_paired_end_run_jul_6_2021/PE_50K_freq_mat.csv'
metadata_file = '/project/data/raw/Jie_2017_supp_table_1.xlsx'
identifiers_file = '/project/data/raw/SRA/all_runs.csv'
output_dir = '/project/data/preprocessed/PE_50K_sex_complete'
overwrite = True
# Parameters
inp_mat_file = "results/kraken2_PE_10M/matrix/freq_mat.csv"
metadata_file = "data/raw/Jie_2017_supp_table_1.xlsx"
identifiers_file = "data/raw/SRA/all_runs.csv"
output_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/prepped_data"
overwrite = True
def read_table(fpath, **kwargs):
"""
read csv or excel file
fpath = filepath
**kwargs = other arguments to pandas.read_csv or pandas.read_excel
returns: pandas dataframe
"""
if re.search('\\.xls(x)*$', fpath):
df = pd.read_excel(fpath, **kwargs)
elif re.search('\\.csv$', fpath):
df = pd.read_csv(fpath, **kwargs)
else:
raise ValueError('Unexpected file type for file {}'.format(fpath))
return df
inp_mat = read_table(inp_mat_file)
matrix_cols = inp_mat.columns[1:len(inp_mat.columns)]
inp_mat['RunID'] = np.array([re.findall('^[A-Z]{3}[0-9]+', x)[0] for x in inp_mat['filename'].to_numpy().astype('str')])
inp_mat = inp_mat.loc[:, np.concatenate((['filename', 'RunID'], matrix_cols.to_numpy()))]
inp_mat.head()
| filename | RunID | Homo sapiens | Brassica napus | BtRf-AlphaCoV/YN2012 | Coronavirus AcCoV-JC34 | Alphamesonivirus 1 | Alphamesonivirus 2 | Alphamesonivirus 4 | Alphamesonivirus 6 | ... | Pacmanvirus A23 | Posavirus sp. | Rasavirus sp. | Rudphi virus 5 | Sewage-associated circular DNA virus-4 | Thika virus | Tunis virus | Wabat virus | Zirqa virus | uncultured Mediterranean phage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ERR2017411_seqt.subsampled_1_kneaddata_paired_... | ERR2017411 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ERR2017412_seqt.subsampled_1_kneaddata_paired_... | ERR2017412 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ERR2017413_seqt.subsampled_1_kneaddata_paired_... | ERR2017413 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ERR2017414_seqt.subsampled_1_kneaddata_paired_... | ERR2017414 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ERR2017415_seqt.subsampled_1_kneaddata_paired_... | ERR2017415 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 9370 columns
meta_data = read_table(metadata_file, skiprows=2)
meta_data.head()
| Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | Body Mass Index (BMI) | Waist (cm) | Hip (cm) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | 23.88 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | 24.61 | 78.0 | 90.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | 25.40 | 92.0 | 98.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | 23.03 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | 26.67 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 55 columns
meta_data['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL525
401 ZSL527
402 ZSL529
403 ZSL530
404 ZSL532
Name: Sample ID, Length: 405, dtype: object
identifiers = read_table(identifiers_file)
identifiers['Sample ID'] = identifiers.pop('Alias')
identifiers.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | |
|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 |
identifiers['Sample ID']
0 N001
1 N002
2 N003
3 N005
4 N006
...
400 ZSL_58
401 ZSL_67
402 ZSL_6
403 ZSL_82
404 ZSL_96
Name: Sample ID, Length: 405, dtype: object
identifiers['Sample ID'] = identifiers['Sample ID'].astype('str')
meta_data['Sample ID'] = meta_data['Sample ID'].astype('str')
meta_data_full = pd.merge(identifiers, meta_data, how='left', on='Sample ID')
meta_data_full.head()
| Unnamed: 0 | ExptAcc | RunID | Sample ID | ACVD status | Coronary type | Age (year) | Gender | Height (cm) | Weight (kg) | ... | Bisoprolol_9 | Fondaparinux Sodium_10 | Metoprolol_19 | Insulin aspart_6 | Isosorbide dinitrate_10 | Acarbose_7 | Captopril Tablets_6 | Estazolam_6 | Nitroglycerin Tablets_8 | Potassium chloride_32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ERX2076993 | ERR2017411 | N001 | 0 | NaN | 52.0 | male | 165.0 | 65.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0 | ERX2076994 | ERR2017412 | N002 | 0 | NaN | 53.0 | male | 160.0 | 63.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0 | ERX2076995 | ERR2017413 | N003 | 0 | NaN | 48.0 | male | 166.0 | 70.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0 | ERX2076996 | ERR2017414 | N005 | 0 | NaN | 53.0 | female | 158.0 | 57.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | ERX2076997 | ERR2017415 | N006 | 0 | NaN | 53.0 | female | 150.0 | 60.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 58 columns
meta_data_full.shape
(405, 58)
meta_data_full = meta_data_full.set_index('RunID')
meta_data_full = meta_data_full.loc[inp_mat['RunID'], :]
inp_mat = inp_mat.set_index('RunID')
# pop filenames from data frame
inp_mat.pop('filename')
RunID
ERR2017411 ERR2017411_seqt.subsampled_1_kneaddata_paired_...
ERR2017412 ERR2017412_seqt.subsampled_1_kneaddata_paired_...
ERR2017413 ERR2017413_seqt.subsampled_1_kneaddata_paired_...
ERR2017414 ERR2017414_seqt.subsampled_1_kneaddata_paired_...
ERR2017415 ERR2017415_seqt.subsampled_1_kneaddata_paired_...
...
ERR2017809 ERR2017809_seqt.subsampled_1_kneaddata_paired_...
ERR2017810 ERR2017810_seqt.subsampled_1_kneaddata_paired_...
ERR2017813 ERR2017813_seqt.subsampled_1_kneaddata_paired_...
ERR2017814 ERR2017814_seqt.subsampled_1_kneaddata_paired_...
ERR2017815 ERR2017815_seqt.subsampled_1_kneaddata_paired_...
Name: filename, Length: 384, dtype: object
feat_meta = pd.DataFrame({'feature': inp_mat.columns.to_numpy().astype('str')})
feat_meta['feature'] = [re.sub('^( )*', '', feat_meta['feature'][i]) for i in range(len(feat_meta['feature']))]
feat_meta
| feature | |
|---|---|
| 0 | Homo sapiens |
| 1 | Brassica napus |
| 2 | BtRf-AlphaCoV/YN2012 |
| 3 | Coronavirus AcCoV-JC34 |
| 4 | Alphamesonivirus 1 |
| ... | ... |
| 9363 | Thika virus |
| 9364 | Tunis virus |
| 9365 | Wabat virus |
| 9366 | Zirqa virus |
| 9367 | uncultured Mediterranean phage |
9368 rows × 1 columns
X = inp_mat.to_numpy().astype('float32')
X[np.isnan(X)] = 0.
zero_feats = [np.sum(X[i,:] == 0.) for i in range(X.shape[0])]
sns.displot(zero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b82ea4c2110>
nonzero_feats = [np.sum(X[i,:] != 0.) for i in range(X.shape[0])]
sns.displot(nonzero_feats)
<seaborn.axisgrid.FacetGrid at 0x2b82e9f39d90>
X_centered = X - (X.sum(axis=0)/X.shape[1])
col_vars = np.power(X_centered, 2).sum(axis=0)/X.shape[1]
sum(col_vars == 0)
8583
feats_keep = col_vars != 0
X = X[:, feats_keep]
inp_mat = inp_mat.iloc[:, feats_keep]
feat_meta = feat_meta.iloc[feats_keep, :]
X.shape
(384, 785)
# check that all values of x are gte 0
assert np.all(X >= 0)
# these samples, if they exist, have 0 detected features
zero_samples = X.sum(axis=1) == 0
# if this is True, then we'd need to remove some samples
any_zero_samples = np.any(zero_samples)
any_zero_samples
False
# if any_zero_samples:
# samples_keep = np.logical_not(zero_samples)
# X = X[samples_keep, :]
# meta_data_full = meta_data_full.iloc[samples_keep, :]
print(X.shape)
print(meta_data_full.shape)
print(feat_meta.shape)
(384, 785) (384, 57) (785, 1)
meta_data_full.columns
Index(['Unnamed: 0', 'ExptAcc', 'Sample ID', 'ACVD status', 'Coronary type',
'Age (year)', 'Gender', 'Height (cm)', 'Weight (kg)',
'Body Mass Index (BMI)', 'Waist (cm)', 'Hip (cm)',
'Waist Hip Ratio (WHR)', 'Smoking', 'Systolic Blood Pressure (mmHg)',
'Diastolic Blood Pressure (mmHg)', 'FBG', 'TRIG (mmol/L)',
'LDLC (mmol/L)', 'CHOL (mmol/L)', 'HDLC (mmol/L)', 'APOB (mg/dl)',
'APOA (mg/dl)', 'Lpa (mg/dl)', 'HbA1c (%)', 'CKMB (U/L)', 'ALB (U/L)',
'ALT (U/L)', 'TP (g/L)', 'AST (U/L)', 'CREA (umol/L)', 'CK (U/L)',
'ProBNP E (Pg/ml)', 'CRP (mg/l)', 'HBDH (U/L)', 'DBIL (umol/L)',
'BUN (nmol/L)', 'TBIL (umol/L)', 'URIC (umol/L)',
'Clopidogrel Hydrogen Sulphate Tablets_117', 'Aspirin_33',
'Atorvastatin_25', 'Esomeprazole_9', 'Isosorbide Mononitrate_11',
'Potassium Citrate_14', 'Perindopril_15', 'Heparin Sodium_10',
'Bisoprolol_9', 'Fondaparinux Sodium_10', 'Metoprolol_19',
'Insulin aspart_6', 'Isosorbide dinitrate_10', 'Acarbose_7',
'Captopril Tablets_6', 'Estazolam_6', 'Nitroglycerin Tablets_8',
'Potassium chloride_32'],
dtype='object')
meta_data_full['ACVD status'].value_counts()
1 214 0 170 Name: ACVD status, dtype: int64
np.any(np.isnan(meta_data_full['ACVD status'].to_numpy()))
False
meta_data_full['Gender'].value_counts()
male 225 female 154 Name: Gender, dtype: int64
# not enough data for smoking status, so we omit it
meta_data_full['Smoking'].value_counts()
no 22 smoking 11 Name: Smoking, dtype: int64
# meta_data_input = meta_data_full.loc[:, ['Gender', 'Age (year)', 'Body Mass Index (BMI)']]
meta_data_input = meta_data_full.loc[:, ['Gender']]
gender_vect = np.reshape(meta_data_input['Gender'].to_numpy().astype('str'), (meta_data_input.shape[0], 1))
gender_encoder = OneHotEncoder(sparse=False)
gender_encoded = gender_encoder.fit_transform(gender_vect)
print(gender_encoded.shape)
(384, 3)
print(gender_encoder.categories_)
[array(['female', 'male', 'nan'], dtype='<U6')]
gender_encoded[gender_vect.flatten() == 'nan', np.logical_not(gender_encoder.categories_ == 'nan')] = np.nan
gender_encoded = np.delete(gender_encoded, 2, 1)
# 1 for gender encoded = male. 0 = female. nan means no data
gender_encoded = np.delete(gender_encoded, 0, 1)
# meta_data_mat = meta_data_input.loc[:, ['Age (year)', 'Body Mass Index (BMI)']].to_numpy().astype('float32')
# meta_data_mat = np.concatenate([meta_data_mat, gender_encoded], axis = 1)
meta_data_mat = gender_encoded
meta_data_mat.shape
(384, 1)
np.sum(np.isnan(meta_data_mat))
5
# 5 values missing for sex
for i in range(meta_data_mat.shape[1]):
print(np.sum(np.isnan(meta_data_mat[:, i])))
5
# we remove all samples for which no clinical data exists
samples_keep = [np.all(np.logical_not(np.isnan(meta_data_mat[i, :]))) for i in range(meta_data_mat.shape[0])]
meta_data_mat = meta_data_mat[samples_keep, :]
X = X[samples_keep, :]
meta_data_subs = meta_data_full.iloc[samples_keep, :]
meta_data_mat.shape
(379, 1)
y = meta_data_full['ACVD status'].to_numpy().astype('int32')[samples_keep]
y.shape
(379,)
if overwrite:
if os.path.exists(output_dir):
os.system('rm -rf ' + output_dir)
os.mkdir(output_dir)
meta_data_subs.to_csv(os.path.join(output_dir, 'metadata_samples_keep.csv'))
feat_meta.to_csv(os.path.join(output_dir, 'feat_meta.csv'))
X_outfile = open(os.path.join(output_dir, 'X.pk'),'wb')
pk.dump(X, X_outfile)
X_outfile.close()
y_outfile = open(os.path.join(output_dir, 'y.pk'), 'wb')
pk.dump(y, y_outfile)
y_outfile.close()
meta_data_mat_outfile = open(os.path.join(output_dir, 'meta_data_mat.pk'), 'wb')
pk.dump(meta_data_mat, meta_data_mat_outfile)
meta_data_mat_outfile.close()